################################################################################ 
#
# The distribution of hate speech and its implications for content moderation
# PSRM - Replication package
# Figure D6
#
################################################################################ 

library(dplyr)
library(tidyr)
library(readr)
library(ggplot2)
library(scales)

# Reset the workspace: drop every object except those whose name starts with
# "wd" or is exactly "setsave".
rm(list = grep("^wd|^setsave$", ls(), value = TRUE, invert = TRUE))

# Axis constants (not referenced anywhere else in this file).
y_limit_vec <- 0.20
break_list <- list(c(0.00, 0.05, 0.10, 0.15, 0.20))

# Classified samples: one file for Twitter CH and one per media source (M1-M3).
df_ch <- readRDS(
  file = paste0(
    wd_data,
    "/medium_twitter_llm_vs_emnlp/tweets_for_llm_classified_anon.rds"
  )
)
df_media_a <- readRDS(
  file = paste0(
    wd_data,
    "/medium_twitter_llm_vs_emnlp/sample_df_source_M1_classified_anon.rds"
  )
)
df_media_b <- readRDS(
  file = paste0(
    wd_data,
    "/medium_twitter_llm_vs_emnlp/sample_df_source_M2_classified_anon.rds"
  )
)
df_media_c <- readRDS(
  file = paste0(
    wd_data,
    "/medium_twitter_llm_vs_emnlp/sample_df_source_M3_classified_anon.rds"
  )
)


# Prevalence
# Number (n) and share (f) of rows per class, once for `prediction` and once
# for `is_hatespeech`. For the media samples `is_hatespeech` is derived from
# `hsprob` with a 0.85 cut-off; for Twitter CH the stored column is used.
#
# The tables are printed explicitly: bare top-level expressions are not
# auto-printed when this file is run through source() with default settings.

# Count rows per value of `label_col` and add each value's share of all rows.
class_shares <- function(data, label_col) {
  data %>%
    group_by(across(all_of(label_col))) %>%
    summarise(n = n(), .groups = "drop") %>%
    mutate(f = n / sum(n))
}

print(class_shares(df_ch, "prediction"))
print(class_shares(df_ch, "is_hatespeech"))

# Same two tables for each media sample, in the order M1, M2, M3.
for (df_media in list(df_media_a, df_media_b, df_media_c)) {
  df_media <- df_media %>% mutate(is_hatespeech = hsprob >= 0.85)
  print(class_shares(df_media, "prediction"))
  print(class_shares(df_media, "is_hatespeech"))
}
rm(df_media)

# CH
# ----------------------------------------------------------
# Long format: one row per input row and classifier. `prediction` is recoded
# to logical; `is_hatespeech` is taken as stored in the data.
df_long <- df_ch %>%
  mutate(prediction = prediction == 1) %>%
  pivot_longer(
    cols = c("prediction", "is_hatespeech"),
    names_to = "classifier",
    values_to = "hs_label"
  )

# Per classifier: each user's share `f` of all rows flagged as hate speech,
# users ranked into 100 bins (bin 1 = largest shares), shares summed per bin.
df_plot_bin <- df_long %>%
  group_by(user_id, classifier, hs_label) %>%
  summarise(n = n(), .groups = "drop") %>%
  # Explicit zero counts keep users without any flagged row in the ranking.
  complete(user_id, classifier, hs_label, fill = list(n = 0)) %>%
  # `== 1` works whether hs_label ends up logical or coded 0/1.
  filter(hs_label == 1) %>%
  group_by(classifier) %>%
  mutate(
    f = n / sum(n),
    bin = ntile(desc(f), 100)
  ) %>%
  group_by(classifier, bin) %>%
  summarise(
    f_sum = sum(f),
    n_user = n_distinct(user_id),
    .groups = "drop"
  ) %>%
  arrange(classifier, bin)

# Copy for the combined figure; `bin` is kept in its original orientation here.
agg_ch <- df_plot_bin %>% mutate(source = "Twitter CH")

# Flip the axis so that percentile 100 holds the users with the largest shares.
# Computed from `bin` itself, so it does not depend on row order or row count.
df_plot_bin <- df_plot_bin %>% mutate(bin = 101 - bin)

ch <- ggplot(
  df_plot_bin,
  aes(x = bin, y = f_sum, fill = classifier, color = classifier)
) +
  geom_col(
    position = position_dodge(width = 0.9),
    width = 0.8,
    alpha = 0.5
  ) +
  scale_x_continuous(breaks = c(0, 25, 50, 75, 100)) +
  scale_y_continuous(
    labels = scales::percent,
    expand = c(0, 0),
    breaks = seq(0, 1, 0.25),
    limits = c(0, 0.5)
  ) +
  # Outline and fill use the same legend title, labels and colours.
  scale_color_manual(
    name = "Classifier",
    breaks = c("is_hatespeech", "prediction"),
    labels = c(
      "is_hatespeech" = "EMNLP Classifier",
      "prediction" = "Fine-Tuned DeepSeek R1 32b"
    ),
    values = c("is_hatespeech" = "#D55E00", "prediction" = "#0072B2")
  ) +
  scale_fill_manual(
    name = "Classifier",
    breaks = c("is_hatespeech", "prediction"),
    labels = c(
      "is_hatespeech" = "EMNLP Classifier",
      "prediction" = "Fine-Tuned DeepSeek R1 32b"
    ),
    values = c("is_hatespeech" = "#D55E00", "prediction" = "#0072B2")
  ) +
  labs(
    subtitle = "Twitter CH",
    x = "Percentiles of Hate-Speechiness",
    y = "Share HS by percentile"
  ) +
  theme_minimal(base_size = 16) +
  theme(
    legend.position = "bottom",
    text = element_text(size = 18),
    plot.subtitle = element_text(hjust = 0.5),
    plot.margin = margin(1, 1, 1.5, 1.2, "cm")
  )

ch

# Media 1
# ----------------------------------------------------------
# Long format: one row per input row and classifier. `is_hatespeech` is
# derived from `hsprob` with a 0.85 cut-off; `prediction` is recoded to logical.
df_long <- df_media_a %>%
  mutate(
    is_hatespeech = hsprob >= 0.85,
    prediction = prediction == 1
  ) %>%
  pivot_longer(
    cols = c("prediction", "is_hatespeech"),
    names_to = "classifier",
    values_to = "hs_label"
  )

# Per classifier: each user's share `f` of all rows flagged as hate speech,
# users ranked into 100 bins (bin 1 = largest shares), shares summed per bin.
df_plot_bin <- df_long %>%
  group_by(user_id, classifier, hs_label) %>%
  summarise(n = n(), .groups = "drop") %>%
  # Explicit zero counts keep users without any flagged row in the ranking.
  complete(user_id, classifier, hs_label, fill = list(n = 0)) %>%
  filter(hs_label == 1) %>%
  group_by(classifier) %>%
  mutate(
    f = n / sum(n),
    bin = ntile(desc(f), 100)
  ) %>%
  group_by(classifier, bin) %>%
  summarise(
    f_sum = sum(f),
    n_user = n_distinct(user_id),
    .groups = "drop"
  ) %>%
  arrange(classifier, bin)

# Copy for the combined figure; `bin` is kept in its original orientation here.
agg_m1 <- df_plot_bin %>% mutate(source = "Medium 1")

# Flip the axis so that percentile 100 holds the users with the largest shares.
# Computed from `bin` itself, so it does not depend on row order or row count.
df_plot_bin <- df_plot_bin %>% mutate(bin = 101 - bin)

m1 <- ggplot(
  df_plot_bin,
  aes(x = bin, y = f_sum, fill = classifier, color = classifier)
) +
  geom_col(
    position = position_dodge(width = 0.9),
    width = 0.8,
    alpha = 0.5
  ) +
  scale_x_continuous(breaks = c(0, 25, 50, 75, 100)) +
  scale_y_continuous(
    labels = scales::percent,
    expand = c(0, 0),
    breaks = seq(0, 1, 0.25),
    limits = c(0, 0.5)
  ) +
  # Outline and fill use the same legend title, labels and colours.
  scale_color_manual(
    name = "Classifier",
    breaks = c("is_hatespeech", "prediction"),
    labels = c(
      "is_hatespeech" = "EMNLP Classifier",
      "prediction" = "Fine-Tuned DeepSeek R1 32b"
    ),
    values = c("is_hatespeech" = "#D55E00", "prediction" = "#0072B2")
  ) +
  scale_fill_manual(
    name = "Classifier",
    breaks = c("is_hatespeech", "prediction"),
    labels = c(
      "is_hatespeech" = "EMNLP Classifier",
      "prediction" = "Fine-Tuned DeepSeek R1 32b"
    ),
    values = c("is_hatespeech" = "#D55E00", "prediction" = "#0072B2")
  ) +
  labs(
    subtitle = "Medium 1",
    x = "Percentiles of Hate-Speechiness",
    y = "Share HS by percentile"
  ) +
  theme_minimal(base_size = 16) +
  theme(
    legend.position = "bottom",
    text = element_text(size = 18),
    plot.subtitle = element_text(hjust = 0.5),
    plot.margin = margin(1, 1, 1.5, 1.2, "cm")
  )

m1


# Media 2
# ----------------------------------------------------------
# Long format: one row per input row and classifier. `is_hatespeech` is
# derived from `hsprob` with a 0.85 cut-off; `prediction` is recoded to logical.
df_long <- df_media_b %>%
  mutate(
    is_hatespeech = hsprob >= 0.85,
    prediction = prediction == 1
  ) %>%
  pivot_longer(
    cols = c("prediction", "is_hatespeech"),
    names_to = "classifier",
    values_to = "hs_label"
  )

# Per classifier: each user's share `f` of all rows flagged as hate speech,
# users ranked into 100 bins (bin 1 = largest shares), shares summed per bin.
df_plot_bin <- df_long %>%
  group_by(user_id, classifier, hs_label) %>%
  summarise(n = n(), .groups = "drop") %>%
  # Explicit zero counts keep users without any flagged row in the ranking.
  complete(user_id, classifier, hs_label, fill = list(n = 0)) %>%
  filter(hs_label == 1) %>%
  group_by(classifier) %>%
  mutate(
    f = n / sum(n),
    bin = ntile(desc(f), 100)
  ) %>%
  group_by(classifier, bin) %>%
  summarise(
    f_sum = sum(f),
    n_user = n_distinct(user_id),
    .groups = "drop"
  ) %>%
  arrange(classifier, bin)

# Copy for the combined figure; `bin` is kept in its original orientation here.
agg_m2 <- df_plot_bin %>% mutate(source = "Medium 2")

# Flip the axis so that percentile 100 holds the users with the largest shares.
# Computed from `bin` itself, so it does not depend on row order or row count.
df_plot_bin <- df_plot_bin %>% mutate(bin = 101 - bin)

m2 <- ggplot(
  df_plot_bin,
  aes(x = bin, y = f_sum, fill = classifier, color = classifier)
) +
  geom_col(
    position = position_dodge(width = 0.9),
    width = 0.8,
    alpha = 0.5
  ) +
  scale_x_continuous(breaks = c(0, 25, 50, 75, 100)) +
  scale_y_continuous(
    labels = scales::percent,
    expand = c(0, 0),
    breaks = seq(0, 1, 0.25),
    limits = c(0, 0.9)
  ) +
  # Outline and fill use the same legend title, labels and colours.
  scale_color_manual(
    name = "Classifier",
    breaks = c("is_hatespeech", "prediction"),
    labels = c(
      "is_hatespeech" = "EMNLP Classifier",
      "prediction" = "Fine-Tuned DeepSeek R1 32b"
    ),
    values = c("is_hatespeech" = "#D55E00", "prediction" = "#0072B2")
  ) +
  scale_fill_manual(
    name = "Classifier",
    breaks = c("is_hatespeech", "prediction"),
    labels = c(
      "is_hatespeech" = "EMNLP Classifier",
      "prediction" = "Fine-Tuned DeepSeek R1 32b"
    ),
    values = c("is_hatespeech" = "#D55E00", "prediction" = "#0072B2")
  ) +
  labs(
    subtitle = "Medium 2",
    x = "Percentiles of Hate-Speechiness",
    y = "Share HS by percentile"
  ) +
  theme_minimal(base_size = 16) +
  theme(
    legend.position = "bottom",
    text = element_text(size = 18),
    plot.subtitle = element_text(hjust = 0.5),
    plot.margin = margin(1, 1, 1.5, 1.2, "cm")
  )

m2

# Media 3
# ----------------------------------------------------------
# Long format: one row per input row and classifier. `is_hatespeech` is
# derived from `hsprob` with a 0.85 cut-off; `prediction` is recoded to logical.
df_long <- df_media_c %>%
  mutate(
    is_hatespeech = hsprob >= 0.85,
    prediction = prediction == 1
  ) %>%
  pivot_longer(
    cols = c("prediction", "is_hatespeech"),
    names_to = "classifier",
    values_to = "hs_label"
  )

# Per classifier: each user's share `f` of all rows flagged as hate speech,
# users ranked into 100 bins (bin 1 = largest shares), shares summed per bin.
df_plot_bin <- df_long %>%
  group_by(user_id, classifier, hs_label) %>%
  summarise(n = n(), .groups = "drop") %>%
  # Explicit zero counts keep users without any flagged row in the ranking.
  complete(user_id, classifier, hs_label, fill = list(n = 0)) %>%
  filter(hs_label == 1) %>%
  group_by(classifier) %>%
  mutate(
    f = n / sum(n),
    bin = ntile(desc(f), 100)
  ) %>%
  group_by(classifier, bin) %>%
  summarise(
    f_sum = sum(f),
    n_user = n_distinct(user_id),
    .groups = "drop"
  ) %>%
  arrange(classifier, bin)

# Copy for the combined figure; `bin` is kept in its original orientation here.
agg_m3 <- df_plot_bin %>% mutate(source = "Medium 3")

# Flip the axis so that percentile 100 holds the users with the largest shares.
# Computed from `bin` itself, so it does not depend on row order or row count.
df_plot_bin <- df_plot_bin %>% mutate(bin = 101 - bin)

m3 <- ggplot(
  df_plot_bin,
  aes(x = bin, y = f_sum, fill = classifier, color = classifier)
) +
  geom_col(
    position = position_dodge(width = 0.9),
    width = 0.8,
    alpha = 0.5
  ) +
  scale_x_continuous(breaks = c(0, 25, 50, 75, 100)) +
  scale_y_continuous(
    labels = scales::percent,
    expand = c(0, 0),
    breaks = seq(0, 1, 0.25),
    limits = c(0, 0.8)
  ) +
  # Outline and fill use the same legend title, labels and colours.
  scale_color_manual(
    name = "Classifier",
    breaks = c("is_hatespeech", "prediction"),
    labels = c(
      "is_hatespeech" = "EMNLP Classifier",
      "prediction" = "Fine-Tuned DeepSeek R1 32b"
    ),
    values = c("is_hatespeech" = "#D55E00", "prediction" = "#0072B2")
  ) +
  scale_fill_manual(
    name = "Classifier",
    breaks = c("is_hatespeech", "prediction"),
    labels = c(
      "is_hatespeech" = "EMNLP Classifier",
      "prediction" = "Fine-Tuned DeepSeek R1 32b"
    ),
    values = c("is_hatespeech" = "#D55E00", "prediction" = "#0072B2")
  ) +
  labs(
    # This panel shows the third media source (it was mislabelled "Medium 2").
    subtitle = "Medium 3",
    x = "Percentiles of Hate-Speechiness",
    y = "Share HS by percentile"
  ) +
  theme_minimal(base_size = 16) +
  theme(
    legend.position = "bottom",
    text = element_text(size = 18),
    plot.subtitle = element_text(hjust = 0.5),
    plot.margin = margin(1, 1, 1.5, 1.2, "cm")
  )

m3
############################################################
# Combined Plot
############################################################
# Stack the per-source bin summaries into one table with one facet per source.
agg <- dplyr::bind_rows(agg_ch, agg_m1, agg_m2, agg_m3) %>%
  mutate(
    # Flip the axis so that percentile 100 holds the users with the largest
    # shares. Computed from `bin` itself rather than assigned by position, so
    # it stays correct if a source has fewer than 100 bins or rows are reordered.
    bin = 101 - bin,
    # Fix the facet order.
    source = factor(
      source,
      levels = c("Twitter CH", "Medium 1", "Medium 2", "Medium 3")
    )
  )

all <- ggplot(
  agg,
  aes(x = bin, y = f_sum, fill = classifier, color = classifier)
) +
  geom_col(
    position = position_dodge(width = 0.9),
    width = 0.8,
    alpha = 0.5
  ) +
  facet_wrap(~source, ncol = 2, scales = "fixed") +
  scale_x_continuous(breaks = c(0, 25, 50, 75, 100)) +
  scale_y_continuous(
    labels = scales::percent,
    expand = c(0, 0),
    breaks = seq(0, 1, 0.25),
    limits = c(0, 0.8)
  ) +
  # Outline and fill use the same legend title, labels and colours.
  scale_color_manual(
    name = "Classifier",
    breaks = c("is_hatespeech", "prediction"),
    labels = c(
      "is_hatespeech" = "EMNLP Classifier",
      "prediction" = "Fine-Tuned DeepSeek R1 32b"
    ),
    values = c("is_hatespeech" = "#D55E00", "prediction" = "#0072B2")
  ) +
  scale_fill_manual(
    name = "Classifier",
    breaks = c("is_hatespeech", "prediction"),
    labels = c(
      "is_hatespeech" = "EMNLP Classifier",
      "prediction" = "Fine-Tuned DeepSeek R1 32b"
    ),
    values = c("is_hatespeech" = "#D55E00", "prediction" = "#0072B2")
  ) +
  labs(
    x = "Percentiles of Hate-Speechiness",
    y = "Share HS by percentile"
  ) +
  theme_minimal(base_size = 16) +
  theme(
    legend.position = "bottom",
    text = element_text(size = 18),
    plot.subtitle = element_text(hjust = 0.5),
    plot.margin = margin(1, 1, 1.5, 1.2, "cm")
  )


all

# Make sure the output folder exists so that saving does not fail on a fresh
# checkout of the replication package.
fig_path <- paste0(wd_res, "/figures/figD6.png")
dir.create(dirname(fig_path), recursive = TRUE, showWarnings = FALSE)
ggsave(fig_path, all, width = 12, height = 12, dpi = 400, bg = "white")
cat("\n====================\n")
cat("Saved Figure D6")
cat("\n====================\n")
